import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.metrics.pairwise import haversine_distances
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")
# IPython magics: these two lines are only valid inside an IPython/Jupyter kernel.
%matplotlib inline
%config InlineBackend.figure_format='retina'
# Turn off pandas' chained-assignment (SettingWithCopy) check.
pd.options.mode.chained_assignment = None
# Load the 2019-2020 trips and the district reference table.
user_routes = pd.read_csv("../input/kv-big-data/user_routes_2019_2020.csv")
districts_info = pd.read_csv("../input/kv-big-data/districts_info.csv", delimiter="|")

# Attach district attributes to both ends of every trip. Columns that come
# from districts_info twice are suffixed by pandas: "_x" for the first merge
# (start area) and "_y" for the second merge (finish area).
df = user_routes.merge(
    districts_info, left_on="start_area_id", right_on="area_id"
).merge(
    districts_info, left_on="finish_area_id", right_on="area_id"
)

# Spell out the unit: pandas 2.x rejects the unit-less "datetime64" dtype.
time_columns = ["hmonth", "start_time", "finish_time"]
df[time_columns] = df[time_columns].astype("datetime64[ns]")
df["year"] = df.hmonth.dt.year

# Explicit copies: preprocess_data adds columns to the frame it is given, and
# those writes should not be aimed at a slice of `df`.
df_2019 = df[df["year"] == 2019].copy()
df_2020 = df[df["year"] == 2020].copy()
def preprocess_data(df):
    """Aggregate per-user travel features from a merged trips frame.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per trip. Reads ``user_id``, ``start_area_id``,
        ``finish_area_id``, the datetime columns ``start_time`` and
        ``finish_time``, and the centroid columns ``centroid_lat_x`` /
        ``centroid_lon_x`` and ``centroid_lat_y`` / ``centroid_lon_y``
        (degrees; ``_x`` is treated as one end of the trip and ``_y`` as
        the other). The per-trip columns ``duration`` and
        ``travel_distance`` are written onto ``df`` as a side effect.

    Returns
    -------
    tuple of numpy.ndarray
        ``(sum_of_areas, duration_sum, tr_distance)`` with one entry per
        user, ordered by ascending ``user_id`` (groupby order).
    """
    earth_radius_km = 6371

    # feature 2 (per trip): absolute difference of the clock-hour components.
    # Minutes and the calendar date do not take part in this value.
    df["duration"] = (df.finish_time.dt.hour - df.start_time.dt.hour).abs()

    # feature 3 (per trip): haversine distance between the two centroids.
    # Each point must be a (latitude, longitude) pair; pairing the two
    # latitudes as one point and the two longitudes as the other gives a
    # non-zero "distance" even when both centroids coincide.
    lat_a = np.radians(df["centroid_lat_x"].to_numpy(dtype=float))
    lon_a = np.radians(df["centroid_lon_x"].to_numpy(dtype=float))
    lat_b = np.radians(df["centroid_lat_y"].to_numpy(dtype=float))
    lon_b = np.radians(df["centroid_lon_y"].to_numpy(dtype=float))
    hav = (
        np.sin((lat_b - lat_a) / 2) ** 2
        + np.cos(lat_a) * np.cos(lat_b) * np.sin((lon_b - lon_a) / 2) ** 2
    )
    # Clip guards arcsin against values a hair above 1 from rounding.
    df["travel_distance"] = (
        2 * earth_radius_km * np.arcsin(np.sqrt(np.clip(hav, 0.0, 1.0)))
    )

    per_user = df.groupby("user_id")

    # feature 1 - sum of areas: non-null start ids plus non-null finish ids.
    sum_of_areas = (
        per_user["start_area_id"].count() + per_user["finish_area_id"].count()
    ).to_numpy()
    duration_sum = per_user["duration"].sum().to_numpy()
    tr_distance = per_user["travel_distance"].sum().to_numpy()
    return sum_of_areas, duration_sum, tr_distance
# Per-user features for the 2019 trips.
areas_2019, duration_2019, distance_2019 = preprocess_data(df_2019)

# One histogram per raw feature, stacked in a single column.
fig = make_subplots(
    rows=3,
    cols=1,
    subplot_titles=(
        "Sum of visited areas",
        "Total travel duration per user",
        "Total covered distance per user",
    ),
)
for subplot_row, feature_values in enumerate(
    (areas_2019, duration_2019, distance_2019), start=1
):
    fig.append_trace(go.Histogram(x=feature_values), row=subplot_row, col=1)
fig.update_layout(
    height=800,
    width=800,
    title_text="Distribution of the Features",
)
fig.show()
# Look at the neighbour list stored on the first merged row.
df["neighbor_area_idx_x"][0]
# NOTE(review): the next line looks like the echoed output of the cell above
# (a quoted, escaped list of area ids) rather than code — confirm before removing.
'"[\\"420\\", \\"548\\", \\"415\\", \\"549\\", \\"417\\", \\"451\\"]"'
# Collect the 2019 features in one frame and apply log(1 + x) to every column.
train_df_2019 = pd.DataFrame(
    {
        "areas_sum": areas_2019,
        "duration": duration_2019,
        "distance": distance_2019,
    }
)
train_df_2019[train_df_2019.columns] = train_df_2019[train_df_2019.columns].apply(
    lambda column: np.log1p(column)
)

# Same three-panel histogram layout as for the raw features.
fig = make_subplots(
    rows=3,
    cols=1,
    subplot_titles=(
        "Sum of visited areas",
        "Total travel duration per user",
        "Total covered distance per user",
    ),
)
for subplot_row, feature_name in enumerate(("areas_sum", "duration", "distance"), start=1):
    fig.append_trace(
        go.Histogram(x=train_df_2019[feature_name]), row=subplot_row, col=1
    )
fig.update_layout(
    height=800,
    width=800,
    title_text="Distribution of the Features after Logarithm Transformation",
)
fig.show()
# Elbow curve: fit K-Means for 1..10 clusters and keep one score per fit.
# NOTE(review): the axis says "SSE loss", but the stored score is the square
# root of KMeans.inertia_.
sse_loss = []
for n in range(1, 11):
    kmeans = KMeans(n_clusters=n).fit(train_df_2019)
    sse_loss.append(np.sqrt(kmeans.inertia_))

fig = go.Figure()
fig.add_trace(
    go.Scatter(x=list(range(1, 11)), y=sse_loss, mode="lines+markers")
)
fig.update_layout(
    xaxis={"tickmode": "linear", "tick0": 1, "dtick": 1},
    title_text="SSE loss VS N clusters",
    xaxis_title="Number of clusters",
    yaxis_title="SSE loss",
)
fig.show()
# Final segmentation model: three clusters fitted on the log-scaled 2019
# features. The seed keeps the cluster numbering stable between runs, which
# matters because later cells refer to clusters by their numeric label.
kmeans_model = KMeans(n_clusters=3, random_state=42)
kmeans_model.fit(train_df_2019)
cluster_centers = kmeans_model.cluster_centers_

train_df_2019_res = train_df_2019.copy()
train_df_2019_res["clusters"] = kmeans_model.labels_
train_df_2019_res["cluster_name"] = train_df_2019_res["clusters"].astype(str)

# Plot the 2019 assignments built just above. (This cell used to plot
# df_2020_preds_res_s, which is only created in the 2020 section further down.)
# NOTE(review): no column is named "Cluster", so the category_orders entry
# probably has no effect; "cluster_name" may have been intended.
fig = px.scatter_3d(
    train_df_2019_res,
    x="areas_sum",
    y="duration",
    z="distance",
    color="cluster_name",
    hover_data=["areas_sum", "duration", "distance"],
    category_orders={"Cluster": ["0", "1", "2"]},
)
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
fig.show()
# Number of 2019 users per cluster. Both output columns are named explicitly
# via rename_axis / reset_index(name=...), because the column names produced
# by a bare value_counts().reset_index() differ between pandas 1.x and 2.x.
# NOTE(review): the 2020 and Zhytomyr tables are built from frames in which
# labels "0" and "1" are exchanged; this 2019 table uses the raw labels.
card_df = (
    train_df_2019_res["cluster_name"]
    .value_counts()
    .rename_axis("User Groups")
    .reset_index(name="User Group Magnitude")
)
card_df
| | User Groups | User Group Magnitude |
|---|---|---|
| 0 | 0 | 20865 |
| 1 | 1 | 15170 |
| 2 | 2 | 13965 |
# Bar chart of the 2019 cluster sizes, one colour per cluster label.
fig = px.bar(
    card_df,
    x="User Groups",
    y="User Group Magnitude",
    color="User Groups",
    color_discrete_sequence=["#00CC96", "#636EFA", "#EF553B"],
    category_orders={"User Groups": ["0", "1", "2"]},
)
fig.update_layout(
    xaxis={"tickmode": "linear", "tick0": 1, "dtick": 1},
    yaxis={"tickmode": "linear", "tick0": 1000, "dtick": 1000},
)
fig.show()
# Build the same three features for 2020 and log-scale them like the 2019 data.
areas_2020, duration_2020, distance_2020 = preprocess_data(df_2020)
df_2020_preds = pd.DataFrame(
    {
        "areas_sum": areas_2020,
        "duration": duration_2020,
        "distance": distance_2020,
    }
)
df_2020_preds[df_2020_preds.columns] = df_2020_preds[df_2020_preds.columns].apply(
    lambda column: np.log1p(column)
)

# Label the 2020 users with the already-fitted model (no refit here).
labels = kmeans_model.predict(df_2020_preds)
cluster_centers = kmeans_model.cluster_centers_

df_2020_preds_res = df_2020_preds.assign(clusters=labels)
df_2020_preds_res["cluster_name"] = df_2020_preds_res["clusters"].astype(str)

# Display copy in which the names "0" and "1" are exchanged.
# NOTE(review): the 2019 frame is not relabelled this way — confirm the swap
# is still wanted when comparing years.
df_2020_preds_res_s = df_2020_preds_res.copy()
swapped_names = df_2020_preds_res_s["cluster_name"].replace(["0", "1"], ["1", "0"])
df_2020_preds_res_s["cluster_name"] = swapped_names

fig = px.scatter_3d(
    df_2020_preds_res_s,
    x="areas_sum",
    y="duration",
    z="distance",
    color="cluster_name",
    hover_data=["areas_sum", "duration", "distance"],
    category_orders={"Cluster": ["0", "1", "2"]},
)
fig.update_layout(margin={"l": 0, "r": 0, "b": 0, "t": 0})
fig.show()
# Number of 2020 users per (relabelled) cluster. Both output columns are named
# explicitly, because the column names produced by a bare
# value_counts().reset_index() differ between pandas 1.x and 2.x.
card_df_2020 = (
    df_2020_preds_res_s["cluster_name"]
    .value_counts()
    .rename_axis("User Groups")
    .reset_index(name="User Group Magnitude")
)
card_df_2020
| | User Groups | User Group Magnitude |
|---|---|---|
| 0 | 1 | 19027 |
| 1 | 0 | 18243 |
| 2 | 2 | 12728 |
# Bar chart of the 2020 cluster sizes, one colour per cluster label.
fig = px.bar(
    card_df_2020,
    x="User Groups",
    y="User Group Magnitude",
    color="User Groups",
    color_discrete_sequence=["#00CC96", "#636EFA", "#EF553B"],
    category_orders={"User Groups": ["0", "1", "2"]},
)
fig.update_layout(
    xaxis={"tickmode": "linear", "tick0": 1, "dtick": 1},
    yaxis={"tickmode": "linear", "tick0": 1000, "dtick": 1000},
)
fig.show()
# Load the Zhytomyr trips and (again) the district reference table.
user_routes_Zhytomyr = pd.read_csv("../input/kv-big-data/user_routes_Zhytomyr.csv")
districts_info = pd.read_csv("../input/kv-big-data/districts_info.csv", delimiter="|")

# Attach district attributes to both ends of every trip; pandas suffixes the
# duplicated district columns with "_x" (first merge, start area) and "_y"
# (second merge, finish area). Note that this rebinds the global `df`.
df = user_routes_Zhytomyr.merge(
    districts_info, left_on="start_area_id", right_on="area_id"
).merge(
    districts_info, left_on="finish_area_id", right_on="area_id"
)

# Spell out the unit: pandas 2.x rejects the unit-less "datetime64" dtype.
time_columns = ["hmonth", "start_time", "finish_time"]
df[time_columns] = df[time_columns].astype("datetime64[ns]")
df["year"] = df.hmonth.dt.year
def preprocess_data(df):
    """Aggregate per-user travel features from a merged trips frame.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per trip. Reads ``user_id``, ``start_area_id``,
        ``finish_area_id``, the datetime columns ``start_time`` and
        ``finish_time``, and the centroid columns ``centroid_lat_x`` /
        ``centroid_lon_x`` and ``centroid_lat_y`` / ``centroid_lon_y``
        (degrees; ``_x`` is treated as one end of the trip and ``_y`` as
        the other). The per-trip columns ``duration`` and
        ``travel_distance`` are written onto ``df`` as a side effect.

    Returns
    -------
    tuple of numpy.ndarray
        ``(sum_of_areas, duration_sum, tr_distance)`` with one entry per
        user, ordered by ascending ``user_id`` (groupby order).
    """
    earth_radius_km = 6371

    # feature 2 (per trip): absolute difference of the clock-hour components.
    # Minutes and the calendar date do not take part in this value.
    df["duration"] = (df.finish_time.dt.hour - df.start_time.dt.hour).abs()

    # feature 3 (per trip): haversine distance between the two centroids.
    # Each point must be a (latitude, longitude) pair; pairing the two
    # latitudes as one point and the two longitudes as the other gives a
    # non-zero "distance" even when both centroids coincide.
    lat_a = np.radians(df["centroid_lat_x"].to_numpy(dtype=float))
    lon_a = np.radians(df["centroid_lon_x"].to_numpy(dtype=float))
    lat_b = np.radians(df["centroid_lat_y"].to_numpy(dtype=float))
    lon_b = np.radians(df["centroid_lon_y"].to_numpy(dtype=float))
    hav = (
        np.sin((lat_b - lat_a) / 2) ** 2
        + np.cos(lat_a) * np.cos(lat_b) * np.sin((lon_b - lon_a) / 2) ** 2
    )
    # Clip guards arcsin against values a hair above 1 from rounding.
    df["travel_distance"] = (
        2 * earth_radius_km * np.arcsin(np.sqrt(np.clip(hav, 0.0, 1.0)))
    )

    per_user = df.groupby("user_id")

    # feature 1 - sum of areas: non-null start ids plus non-null finish ids.
    sum_of_areas = (
        per_user["start_area_id"].count() + per_user["finish_area_id"].count()
    ).to_numpy()
    duration_sum = per_user["duration"].sum().to_numpy()
    tr_distance = per_user["travel_distance"].sum().to_numpy()
    return sum_of_areas, duration_sum, tr_distance
# Per-user features for the Zhytomyr trips.
areas_Zhytomyr, duration_Zhytomyr, distance_Zhytomyr = preprocess_data(df)

# One histogram per raw feature, stacked in a single column.
fig = make_subplots(
    rows=3,
    cols=1,
    subplot_titles=(
        "Sum of visited areas",
        "Total travel duration per user",
        "Total covered distance per user",
    ),
)
for subplot_row, feature_values in enumerate(
    (areas_Zhytomyr, duration_Zhytomyr, distance_Zhytomyr), start=1
):
    fig.append_trace(go.Histogram(x=feature_values), row=subplot_row, col=1)
fig.update_layout(
    height=800,
    width=800,
    title_text="Distribution of the Features",
)
fig.show()
# Collect the Zhytomyr features in one frame and apply log(1 + x) to every column.
df_Zhytomyr = pd.DataFrame(
    {
        "areas_sum": areas_Zhytomyr,
        "duration": duration_Zhytomyr,
        "distance": distance_Zhytomyr,
    }
)
df_Zhytomyr[df_Zhytomyr.columns] = df_Zhytomyr[df_Zhytomyr.columns].apply(
    lambda column: np.log1p(column)
)

# Same three-panel histogram layout as for the raw features.
fig = make_subplots(
    rows=3,
    cols=1,
    subplot_titles=(
        "Sum of visited areas",
        "Total travel duration per user",
        "Total covered distance per user",
    ),
)
for subplot_row, feature_name in enumerate(("areas_sum", "duration", "distance"), start=1):
    fig.append_trace(
        go.Histogram(x=df_Zhytomyr[feature_name]), row=subplot_row, col=1
    )
fig.update_layout(
    height=800,
    width=800,
    title_text="Distribution of the Features after Logarithm Transformation",
)
fig.show()
# Label the Zhytomyr users with the already-fitted model (no refit here).
labels_Zhytomyr = kmeans_model.predict(df_Zhytomyr)
cluster_centers = kmeans_model.cluster_centers_

df_Zhytomyr_preds = df_Zhytomyr.assign(clusters=labels_Zhytomyr)
df_Zhytomyr_preds["cluster_name"] = df_Zhytomyr_preds["clusters"].astype(str)

# Display copy in which the names "0" and "1" are exchanged.
# NOTE(review): the 2019 frame is not relabelled this way — confirm the swap
# is still wanted when comparing datasets.
df_Zhytomyr_preds_s = df_Zhytomyr_preds.copy()
swapped_names = df_Zhytomyr_preds_s["cluster_name"].replace(["0", "1"], ["1", "0"])
df_Zhytomyr_preds_s["cluster_name"] = swapped_names

fig = px.scatter_3d(
    df_Zhytomyr_preds_s,
    x="areas_sum",
    y="duration",
    z="distance",
    color="cluster_name",
    color_discrete_sequence=["#EF553B", "#636EFA", "#00CC96"],
    hover_data=["areas_sum", "duration", "distance"],
    category_orders={"Cluster": ["0", "1", "2"]},
)
fig.update_layout(margin={"l": 0, "r": 0, "b": 0, "t": 0})
fig.show()
# Number of Zhytomyr users per (relabelled) cluster. Both output columns are
# named explicitly, because the column names produced by a bare
# value_counts().reset_index() differ between pandas 1.x and 2.x.
card_Zhytomyr = (
    df_Zhytomyr_preds_s["cluster_name"]
    .value_counts()
    .rename_axis("User Groups")
    .reset_index(name="User Group Magnitude")
)
card_Zhytomyr
| | User Groups | User Group Magnitude |
|---|---|---|
| 0 | 1 | 2193 |
| 1 | 2 | 1946 |
| 2 | 0 | 1364 |
# Bar chart of the Zhytomyr cluster sizes, one colour per cluster label.
fig = px.bar(
    card_Zhytomyr,
    x="User Groups",
    y="User Group Magnitude",
    color="User Groups",
    color_discrete_sequence=["#00CC96", "#636EFA", "#EF553B"],
    category_orders={"User Groups": ["0", "1", "2"]},
)
fig.update_layout(
    xaxis={"tickmode": "linear", "tick0": 1, "dtick": 1},
    yaxis={"tickmode": "linear", "tick0": 1000, "dtick": 1000},
)
fig.show()
def get_labels_percent(df):
    """Return cluster sizes as percentage shares with readable cluster names.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``"User Groups"`` (cluster labels) and
        ``"User Group Magnitude"`` (user count per cluster).

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ``"User Groups"``, with an added ``"percent"``
        column (share of the total magnitude, 0-100) and the labels
        ``"0"``/``"1"``/``"2"`` replaced by ``"Cluster 1"``/``"Cluster 2"``/
        ``"Cluster 3"``. The frame passed in is left untouched.
    """
    # Work on a copy so the caller's table does not silently gain a column.
    shares = df.copy()
    all_users = shares["User Group Magnitude"].sum()
    shares["percent"] = 100 * shares["User Group Magnitude"] / all_users
    shares = shares.sort_values(by="User Groups")
    shares["User Groups"] = shares["User Groups"].replace(
        ["0", "1", "2"], ["Cluster 1", "Cluster 2", "Cluster 3"]
    )
    return shares
def _percent_by_cluster(frame, clusters):
    """Return ``frame["percent"]`` as a float array ordered like *clusters*."""
    if "User Groups" in frame.columns and frame["User Groups"].isin(clusters).all():
        # Align by label; a cluster with no row contributes a 0 % share.
        by_label = frame.groupby("User Groups")["percent"].sum()
        return by_label.reindex(clusters, fill_value=0.0).to_numpy(dtype=float)
    # Labels are not the expected cluster names: keep plain row order.
    return frame["percent"].to_numpy(dtype=float)


def _signed_percent(delta):
    """Format a difference as text with an explicit sign, e.g. ``+4.5%``."""
    rounded = float(np.round(delta, 2))
    if rounded == 0:
        # Collapse negative zero, which would otherwise print as "+-0.0%".
        rounded = 0.0
    text = str(rounded)
    return text + "%" if rounded < 0 else "+" + text + "%"


def get_diff(df19, df20, dfZhytomyr):
    """Tabulate how cluster shares differ from the 2019 baseline.

    Parameters
    ----------
    df19, df20, dfZhytomyr : pandas.DataFrame
        Frames with a ``"percent"`` column and, normally, a ``"User Groups"``
        column holding ``"Cluster 1"``..``"Cluster 3"``. Rows are matched by
        cluster name; if the names are not recognised, rows are taken in
        their existing order instead.

    Returns
    -------
    pandas.DataFrame
        Columns ``"User Groups"``, ``"20-19"`` and ``"Zhytomyr-19"``; the two
        difference columns hold strings rounded to two decimals with a
        leading sign and a trailing ``%``.
    """
    clusters = ["Cluster 1", "Cluster 2", "Cluster 3"]
    baseline = _percent_by_cluster(df19, clusters)

    res = pd.DataFrame()
    res["User Groups"] = clusters
    res["20-19"] = [
        _signed_percent(delta)
        for delta in _percent_by_cluster(df20, clusters) - baseline
    ]
    res["Zhytomyr-19"] = [
        _signed_percent(delta)
        for delta in _percent_by_cluster(dfZhytomyr, clusters) - baseline
    ]
    return res
# Percentage tables for the three datasets and their differences from 2019.
labels_percent_2019 = get_labels_percent(card_df)
labels_percent_2020 = get_labels_percent(card_df_2020)
labels_percent_Zhytomyr = get_labels_percent(card_Zhytomyr)
df_diff = get_diff(labels_percent_2019, labels_percent_2020, labels_percent_Zhytomyr)

# One pie per dataset, side by side, all sharing the same colour list.
fig = make_subplots(
    rows=1,
    cols=3,
    specs=[[{"type": "pie"}, {"type": "pie"}, {"type": "pie"}]],
)
pie_panels = (
    (labels_percent_2019, "2019 user clusters"),
    (labels_percent_2020, "2020 user clusters"),
    (labels_percent_Zhytomyr, "Zhytomyr user clusters"),
)
for pie_col, (shares, caption) in enumerate(pie_panels, start=1):
    fig.add_trace(
        go.Pie(
            labels=shares["User Groups"],
            values=shares["percent"],
            marker={"colors": ["#00CC96", "#636EFA", "#EF553B"]},
            title_text=caption,
        ),
        1,
        pie_col,
    )
fig.update_layout(
    title="Comparing results of segmentation",
    legend_title="Clusters",
    font={"size": 14},
)
fig.update_traces(textposition="inside", textinfo="percent+label")
fig.show()
# fig2 = make_subplots(rows=3, cols=2, specs=[[{"type": "pie"}, {"type": "pie"}]])
# fig2.add_trace(go.Pie(labels=))
def annotation_new(x, y, text, color, figure=None):
    """Add an arrowed text annotation at position (x, y) of a figure.

    Parameters
    ----------
    x, y
        Coordinates handed to the annotation as-is.
    text : str
        Annotation label.
    color : str
        Font colour of the label (font size is fixed at 14).
    figure : optional
        Figure to annotate. When omitted, the module-level ``fig`` that
        exists at call time is used, which is what existing callers rely on.

    Returns
    -------
    Whatever ``add_annotation`` of the target figure returns.
    """
    target = fig if figure is None else figure
    # The annotation properties are passed as keyword arguments, which
    # add_annotation accepts in place of a prebuilt Annotation object.
    return target.add_annotation(
        x=x,
        y=y,
        text=text,
        font=dict(color=color, size=14),
        showarrow=True,
        arrowhead=1,
    )
# Grouped bars: cluster shares for 2019, 2020 and Zhytomyr next to each other.
bar_series = (
    (labels_percent_2019, "#5e03fc", "2019 year"),
    (labels_percent_2020, "#8f9fb8", "2020 year"),
    (labels_percent_Zhytomyr, "#a3a28c", "Zhytomyr data"),
)
fig = go.Figure(
    data=[
        go.Bar(
            x=shares["User Groups"],
            y=shares["percent"],
            marker={"color": bar_color},
            name=series_name,
        )
        for shares, bar_color, series_name in bar_series
    ]
)
fig.update_layout(barmode="group", title="Comparing results with 2019 year")

# Label each 2020 / Zhytomyr bar with its difference from 2019.
# NOTE: positions and colours below are fixed numbers, not derived from the data.
diff_2020 = df_diff["20-19"].values
diff_zhytomyr = df_diff["Zhytomyr-19"].values
for x_pos, y_pos, label, label_color in (
    (0.02, 36.48, diff_2020[0], "red"),
    (0.29, 25, diff_zhytomyr[0], "red"),
    (1, 38.05, diff_2020[1], "green"),
    (1.28, 39.85, diff_zhytomyr[1], "green"),
    (2, 25.46, diff_2020[2], "red"),
    (2.28, 35.36, diff_zhytomyr[2], "green"),
):
    annotation_new(x_pos, y_pos, label, label_color)
fig.show()